from pathlib import Path
import altair as alt
import polars as pl
import gradio as gr
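# Dataset names, populated at startup from the results_*.csv files found next to this script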
DATASETS = []
BENCHMARKS = {
# Name: (device, AMP, compile, single thread)
"Parameters": (None, None, None, None),
"GPU Memory": (None, None, None, None),
"CPU rate": ("cpu", False, False, False),
"CPU rate single core": ("cpu", False, False, True),
"CPU rate with compile": ("cpu", False, True, False),
"CPU rate AMP with compile": ("cpu", True, True, False),
"CUDA rate": ("cuda", False, False, False),
"CUDA rate with compile": ("cuda", False, True, False),
"CUDA rate AMP with compile": ("cuda", True, True, False),
}
def plot_acc_param(param_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
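    """Scatter accuracy vs. parameter count per model, with name labels and the Pareto frontier line"""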
df = param_compare_results_df.select(
"Model name",
"Model type",
"Accuracy",
"Top-3 accuracy",
"Resolution",
"Parameters (M)",
"Pareto frontier (p)",
"Intermediate",
"MIM",
"Distilled",
)
base = df.plot.point(
x="Parameters (M)",
y="Accuracy",
color="Model type",
shape="Resolution:N",
tooltip=[
"Parameters (M)",
"Accuracy",
"Top-3 accuracy",
"Model name",
"Model type",
"Resolution",
"Intermediate",
"MIM",
"Distilled",
],
)
text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
frontier = df.plot.line(x="Parameters (M)", y="Pareto frontier (p)").mark_line(
interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
)
chart = base + text + frontier
return chart.properties(title="Accuray vs Parameter Count", width=width, height=height).configure_scale(zero=False)
def plot_acc_memory(memory_compare_results_df: pl.DataFrame, width: int = 900, height: int = 640) -> alt.LayerChart:
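    """Scatter accuracy vs. peak GPU memory per model, with name labels and the Pareto frontier line"""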
if len(memory_compare_results_df) > 0:
batch_size = memory_compare_results_df["batch_size"][0]
amp = memory_compare_results_df["amp"][0]
else:
batch_size = ""
amp = ""
df = memory_compare_results_df.select(
"Model name",
"Model type",
"Accuracy",
"Top-3 accuracy",
"Resolution",
"Peak GPU memory (MB)",
"Parameters (M)",
"Pareto frontier (mem)",
"Intermediate",
"MIM",
"Distilled",
)
base = df.plot.point(
x="Peak GPU memory (MB)",
y="Accuracy",
color="Model type",
shape="Resolution:N",
tooltip=[
"Peak GPU memory (MB)",
"Parameters (M)",
"Accuracy",
"Top-3 accuracy",
"Model name",
"Model type",
"Resolution",
"Intermediate",
"MIM",
"Distilled",
],
)
text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
frontier = df.plot.line(x="Peak GPU memory (MB)", y="Pareto frontier (mem)").mark_line(
interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
)
chart = base + text + frontier
return chart.properties(
title=f"Accuray vs GPU Memory (batch size={batch_size}, amp={amp})", width=width, height=height
).configure_scale(zero=False)
def plot_acc_rate(rate_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
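    """Scatter accuracy vs. inference time (ms / sample) per model, with name labels and the Pareto frontier line"""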
if len(rate_compare_results_df) > 0:
device = rate_compare_results_df["device"][0]
compiled = rate_compare_results_df["compile"][0]
batch_size = rate_compare_results_df["batch_size"][0]
amp = rate_compare_results_df["amp"][0]
single_thread = rate_compare_results_df["single_thread"][0]
else:
device = ""
compiled = ""
batch_size = ""
amp = ""
single_thread = False
df = rate_compare_results_df.select(
"Model name",
"Model type",
"Accuracy",
"Top-3 accuracy",
"Resolution",
"ms / sample",
"Parameters (M)",
"Pareto frontier (ms)",
"Intermediate",
"MIM",
"Distilled",
)
base = df.plot.point(
x="ms / sample",
y="Accuracy",
color="Model type",
shape="Resolution:N",
tooltip=[
"ms / sample",
"Parameters (M)",
"Accuracy",
"Top-3 accuracy",
"Model name",
"Model type",
"Resolution",
"Intermediate",
"MIM",
"Distilled",
],
)
text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
frontier = df.plot.line(x="ms / sample", y="Pareto frontier (ms)").mark_line(
interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
)
chart = base + text + frontier
if single_thread is True:
single_thread_title = " Single Core"
else:
single_thread_title = ""
return chart.properties(
title=(
f"Accuracy vs {device.upper()}{single_thread_title} Rate (compile={compiled}, "
f"batch size={batch_size}, amp={amp})"
),
width=width,
height=height,
).configure_scale(zero=False)
def update_data(
dataset: str, benchmark: str, intermediate: bool, mim: bool, dist: bool, log_x: bool, search_bar: str
) -> tuple[alt.LayerChart, pl.DataFrame]:
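    """Load the selected dataset results, apply the UI filters, compute the Pareto frontier for the chosen benchmark and return the chart plus the table data"""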
compare_results_df = pl.read_csv(f"results_{dataset}.csv")
if intermediate is False:
compare_results_df = compare_results_df.filter(pl.col("Intermediate") == intermediate)
if mim is False:
compare_results_df = compare_results_df.filter(pl.col("MIM") == mim)
if dist is False:
compare_results_df = compare_results_df.filter(pl.col("Distilled") == dist)
x_scale_type = "log" if log_x is True else "linear"
    # Filter models by name (the search string is treated as a regex)
compare_results_df = compare_results_df.filter(pl.col("Model name").str.contains(search_bar))
# Parameter count
if benchmark == "Parameters":
param_compare_results_df = compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
"Parameters (M)", descending=False
)
param_compare_results_df = param_compare_results_df.with_columns(
pl.col("Accuracy").cum_max().alias("Pareto frontier (p)")
)
param_compare_results_df = param_compare_results_df.drop(
"Samples / sec", "device", "ms / sample", "Peak GPU memory (MB)"
)
chart = plot_acc_param(param_compare_results_df)
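        # Clamp the x-domain to the 10th-90th percentile so outliers do not stretch the axis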
x_max = param_compare_results_df["Parameters (M)"].quantile(0.9)
x_min = param_compare_results_df["Parameters (M)"].quantile(0.1)
chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
output_df = param_compare_results_df
# Peak memory
elif benchmark == "GPU Memory":
memory_compare_results_df = compare_results_df.drop_nulls(subset=["Peak GPU memory (MB)"])
memory_compare_results_df = memory_compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
"Peak GPU memory (MB)", descending=False
)
memory_compare_results_df = memory_compare_results_df.with_columns(
pl.col("Accuracy").cum_max().alias("Pareto frontier (mem)")
)
memory_compare_results_df = memory_compare_results_df.drop("Samples / sec", "device", "ms / sample")
chart = plot_acc_memory(memory_compare_results_df)
x_max = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.9)
x_min = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.1)
chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
output_df = memory_compare_results_df
# Rate
else:
(device, amp_enabled, compiled, single_thread) = BENCHMARKS[benchmark]
df = compare_results_df.drop_nulls(subset=["ms / sample"])
df = df.filter(device=device, amp=amp_enabled, compile=compiled, single_thread=single_thread)
device_compare_results_df = df.unique(subset=["Model name", "Resolution"]).sort("ms / sample", descending=False)
device_compare_results_df = device_compare_results_df.drop("Peak GPU memory (MB)")
device_compare_results_df = device_compare_results_df.with_columns(
pl.col("Accuracy").cum_max().alias("Pareto frontier (ms)")
)
chart = plot_acc_rate(device_compare_results_df)
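        # Clamp the x-domain to [min, 95th percentile] with a small margin so slow outliers do not stretch the axis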
x_max = device_compare_results_df["ms / sample"].quantile(0.95)
x_min = device_compare_results_df["ms / sample"].min()
if x_max is not None and x_min is not None:
x_max = x_max * 1.04
x_min = x_min * 0.96
chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
output_df = device_compare_results_df
output_df = output_df.select(
[
pl.col(col).round(4) if output_df.schema[col] in [pl.Float32, pl.Float64] else col
for col in output_df.columns
]
)
return (chart, output_df.drop("Mistakes", "Samples", "torch_version"))
def app() -> None:
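    """Build the Gradio Blocks UI, wire the controls to update_data and launch the leaderboard"""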
with gr.Blocks(title="Birder Leaderboard", analytics_enabled=False) as leaderboard:
gr.HTML("<center><h1>The Birder Leaderboard</h1></center>")
with gr.Row():
with gr.Column():
pass
with gr.Column():
gr.Markdown(
"""
Leaderboard of all the pre-trained Birder models across multiple datasets.
### Benchmark Setup
* GPU: A5000 ADA Generation
* CPU: AMD Ryzen Threadripper PRO 7975WX
* PyTorch version: 2.5.1+cu124
### Dataset Information
| Name | Training samples | Validation samples | Classes |
|---------------------|------------------|--------------------|-------------|
| arabian-peninsula | 583,868 | 21,634 | 735 |
| eu-common | 569,784 | 19,869 | 707 |
| il-all | 462,346 | 18,614 | 550 |
| il-common | 330,880 | 15,828 | 371 |
"""
)
with gr.Column():
pass
with gr.Row():
with gr.Column():
pass
with gr.Column():
dataset_dropdown = gr.Dropdown(
choices=DATASETS,
label="Select Dataset",
value=DATASETS[0] if DATASETS else None,
)
benchmark_dropdown = gr.Dropdown(
choices=BENCHMARKS.keys(),
label="Select Benchmark",
value=next(iter(BENCHMARKS.keys())) if BENCHMARKS else None,
filterable=False,
)
with gr.Column():
intermediate = gr.Checkbox(
label="Intermediate",
value=True,
info="Show models that underwent intermediate training (extra data)",
)
mim = gr.Checkbox(label="MIM", value=True, info="Show models with Masked Image Modeling pre-training")
dist = gr.Checkbox(label="Distilled", value=True, info="Show distilled models")
log_x = gr.Checkbox(label="Log scale X-axis", value=False)
with gr.Column():
pass
with gr.Row():
with gr.Column():
pass
with gr.Column(scale=2):
search_bar = gr.Textbox(label="Model Filter", placeholder="e.g. convnext, efficient|mobile")
with gr.Column():
pass
plot = gr.Plot(container=False)
table = gr.Dataframe(show_search="search")
inputs = [dataset_dropdown, benchmark_dropdown, intermediate, mim, dist, log_x, search_bar]
outputs = [plot, table]
leaderboard.load(update_data, inputs=inputs, outputs=outputs)
dataset_dropdown.change(update_data, inputs=inputs, outputs=outputs)
benchmark_dropdown.change(update_data, inputs=inputs, outputs=outputs)
intermediate.change(update_data, inputs=inputs, outputs=outputs)
mim.change(update_data, inputs=inputs, outputs=outputs)
dist.change(update_data, inputs=inputs, outputs=outputs)
log_x.change(update_data, inputs=inputs, outputs=outputs)
search_bar.change(update_data, inputs=inputs, outputs=outputs)
leaderboard.launch()
# Discover the available datasets and launch the app
if __name__ == "__main__":
    # Each results_<dataset>.csv file corresponds to one dataset
    dataset_files = Path(".").glob("results_*.csv")
    DATASETS = sorted((p.stem.removeprefix("results_") for p in dataset_files), reverse=True)
app()