import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from datasets import load_dataset
import json

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN
from src.populate import get_leaderboard_df


def restart_space():
    API.restart_space(repo_id=REPO_ID)


### Space initialisation
# Mirror the evaluation-request dataset locally; if the download fails,
# restart the Space so the next boot can retry.
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()

# Load the benchmark and index its issues by id and by bug category.
dataset = load_dataset("dtcxzyw/llvm-apr-benchmark")
total_issues = dataset.num_rows["test"]
bug_id_to_time = dict()
bug_id_by_cat = {
    "crash": [],
    "miscompilation": [],
    "hang": [],
}
for issue in dataset["test"]:
    bug_id_to_time[issue["bug_id"]] = pd.to_datetime(issue["knowledge_cutoff"])
    bug_id_by_cat[issue["bug_type"]].append(issue["bug_id"])

# Build the timeline scatter data. Each y value is one horizontal track:
# y=0 holds every issue ("All"), y=1..3 hold the per-category tracks, and
# y=-1, -2, ... hold one track per evaluated method.
timeline_xs = []
timeline_ys = []
timeline_cols = []
timeline_bugids = []
model_cnt = 0
for bug_id, time in bug_id_to_time.items():
    timeline_xs.append(time)
    timeline_ys.append(0)
    timeline_cols.append("All")
    timeline_bugids.append(bug_id)
cat_cnt = 4
for cat, bug_ids in bug_id_by_cat.items():
    cat_cnt -= 1
    for bug_id in bug_ids:
        timeline_xs.append(bug_id_to_time[bug_id])
        timeline_ys.append(cat_cnt)
        timeline_cols.append(str(cat).capitalize())
        timeline_bugids.append(bug_id)

LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, total_issues)
for row in LEADERBOARD_DF.itertuples():
    print(row)
    model_cnt += 1
    for fix in row.fixed_bug_ids:
        timeline_xs.append(bug_id_to_time[fix])
        timeline_ys.append(-model_cnt)
        timeline_cols.append(row.method_id)
        timeline_bugids.append(fix)

timeline_df = pd.DataFrame(
    {
        "time": timeline_xs,
        "model": timeline_ys,
        "method_name": timeline_cols,
        "bug_id": timeline_bugids,
    }
)


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.method_name.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.with_hint.name, type="checkboxgroup", label="Hint"),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT + f"\nTotal issues: {total_issues}\n", elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF[COLS])
            gr.ScatterPlot(
                timeline_df,
                x="time",
                y="model",
                color="method_name",
                x_label="Time",
                y_label="Model",
                title="Timeline",
                # Span the method tracks (negative y) up through the three
                # category tracks at y=1..3.
                y_lim=(-model_cnt - 1, 4),
                tooltip=["bug_id", "method_name", "time"],
            )

        with gr.TabItem("🚀 Submission", elem_id="llm-benchmark-tab-table", id=1):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=6,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Restart the Space every 30 minutes so the leaderboard picks up new results.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
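
# ---------------------------------------------------------------------------
# Note on the expected leaderboard schema: a sketch, not the authoritative
# definition (which lives in src/populate.py and src/display/utils.py). The
# code above only relies on get_leaderboard_df() returning a DataFrame whose
# rows expose at least these fields:
#
#     method_id        "my-fixer-v1"        # hypothetical example value
#     method_name      "My Fixer v1"        # searched by the leaderboard
#     with_hint        True                 # drives the "Hint" checkbox filter
#     fixed_bug_ids    ["issue-123", ...]   # plotted on the timeline
#
# plus whatever score columns COLS selects for display.
# ---------------------------------------------------------------------------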