import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from datasets import load_dataset
import json

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN
from src.populate import get_leaderboard_df


def restart_space():
    """Ask the Hub API to restart the Space identified by ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)


def _repair_rate(repaired, total):
    """Return ``repaired / total`` as a percentage rounded to one decimal.

    Returns ``0.0`` when ``total`` is zero so that an empty benchmark does not
    raise ``ZeroDivisionError`` while the summary table is being built.
    """
    if not total:
        return 0.0
    return round(repaired / total * 100, 1)


### Space initialisation
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
    dataset = load_dataset("dtcxzyw/llvm-apr-benchmark")
except Exception:
    # Nothing below can run without the downloaded data. Request a restart,
    # then re-raise so the real cause is reported instead of a NameError on
    # the never-assigned ``dataset``.
    restart_space()
    raise

total_issues = dataset.num_rows["test"]

# Per-issue lookup tables, all keyed by bug id.
bug_id_to_time = dict()
bug_id_to_type = dict()
bug_id_by_cat = {
    "crash": [],
    "miscompilation": [],
    "hang": [],
}
bug_id_to_comp = dict()
# Number of benchmark issues that list each component in their hints.
comp_bug_count = dict()
for issue in dataset["test"]:
    bug_id = issue["bug_id"]
    bug_type = issue["bug_type"]
    components = issue["hints"]["components"]
    bug_id_to_time[bug_id] = pd.to_datetime(issue["knowledge_cutoff"])
    bug_id_by_cat[bug_type].append(bug_id)
    bug_id_to_type[bug_id] = bug_type
    bug_id_to_comp[bug_id] = components
    for comp in components:
        comp_bug_count[comp] = comp_bug_count.get(comp, 0) + 1

# Timeline scatter data, built as parallel lists (one entry per plotted point).
# The y value selects the row: 0 is every issue ("All"), 3/2/1 are the bug
# categories in declaration order, and -1, -2, ... are the leaderboard methods.
timeline_xs = []
timeline_ys = []
timeline_cols = []
timeline_bugids = []
model_cnt = 0
for bug_id in bug_id_to_time:
    timeline_ys.append(0)
    timeline_cols.append("All")
    timeline_bugids.append(bug_id)

cat_cnt = 4
for cat, bug_ids in bug_id_by_cat.items():
    cat_cnt -= 1
    for bug_id in bug_ids:
        timeline_ys.append(cat_cnt)
        timeline_cols.append(str(cat).capitalize())
        timeline_bugids.append(bug_id)

LEADERBOARD_DF = get_leaderboard_df(EVAL_REQUESTS_PATH, total_issues)

# Union, over all leaderboard rows, of the bugs each method fixed.
fixed_bug_ids = set()
fixed_bug_ids_fast = set()
for row in LEADERBOARD_DF.itertuples():
    print(row)
    model_cnt += 1
    for fix in row.fixed_bug_ids:
        timeline_ys.append(-model_cnt)
        timeline_cols.append(row.method_id)
        timeline_bugids.append(fix)
        fixed_bug_ids.add(fix)
    fixed_bug_ids_fast.update(row.fixed_bug_ids_fast)

# The x coordinate and bug type are looked up once every point is known.
timeline_bugtypes = []
for bug_id in timeline_bugids:
    timeline_xs.append(bug_id_to_time[bug_id])
    timeline_bugtypes.append(bug_id_to_type[bug_id])

timeline_df = pd.DataFrame(
    {
        "time": timeline_xs,
        "model": timeline_ys,
        "method_name": timeline_cols,
        "bug_id": timeline_bugids,
        "bug_type": timeline_bugtypes,
    }
)

# Per-category repair counts; only categories with at least one fix get a key.
fixed_by_cat = dict()
fixed_by_cat_fast = dict()
for bug_id in fixed_bug_ids:
    bug_type = bug_id_to_type[bug_id]
    fixed_by_cat[bug_type] = fixed_by_cat.get(bug_type, 0) + 1
for bug_id in fixed_bug_ids_fast:
    bug_type = bug_id_to_type[bug_id]
    fixed_by_cat_fast[bug_type] = fixed_by_cat_fast.get(bug_type, 0) + 1

fixed_by_cat["All"] = len(fixed_bug_ids)
# Placeholder list so len(bug_id_by_cat["All"]) equals the total issue count.
bug_id_by_cat["All"] = [0] * total_issues
fixed_by_cat_fast["All"] = len(fixed_bug_ids_fast)

summary_cats = list(fixed_by_cat)
fixed_by_cat_df = pd.DataFrame(
    {
        "Category": [str(cat).capitalize() for cat in summary_cats],
        "Total": [len(bug_id_by_cat[cat]) for cat in summary_cats],
        "Repaired": [fixed_by_cat[cat] for cat in summary_cats],
        "Repair Rate (%)": [
            _repair_rate(fixed_by_cat[cat], len(bug_id_by_cat[cat]))
            for cat in summary_cats
        ],
        "Repaired (Fast)": [fixed_by_cat_fast.get(cat, 0) for cat in summary_cats],
        "Repair Rate (Fast) (%)": [
            _repair_rate(fixed_by_cat_fast.get(cat, 0), len(bug_id_by_cat[cat]))
            for cat in summary_cats
        ],
    }
)
fixed_by_cat_df.sort_values("Total", inplace=True, ascending=False)

# Per-component repair counts. A bug counts once for every component it lists.
fixed_by_comp = dict()
for bug_id in fixed_bug_ids:
    for comp in bug_id_to_comp[bug_id]:
        fixed_by_comp[comp] = fixed_by_comp.get(comp, 0) + 1

fixed_by_comp_fast = dict()
for bug_id in fixed_bug_ids_fast:
    for comp in bug_id_to_comp[bug_id]:
        fixed_by_comp_fast[comp] = fixed_by_comp_fast.get(comp, 0) + 1
# Per-component repair summary: one row per component seen in the benchmark,
# ordered by how many issues mention that component (largest first).
_components = list(comp_bug_count)
_comp_totals = [comp_bug_count[name] for name in _components]
_comp_repaired = [fixed_by_comp.get(name, 0) for name in _components]
_comp_repaired_fast = [fixed_by_comp_fast.get(name, 0) for name in _components]

fixed_by_comp_df = pd.DataFrame(
    {
        "Component": _components,
        "Total": _comp_totals,
        "Repaired": _comp_repaired,
        "Repair Rate (%)": [
            round(done / total * 100, 1)
            for done, total in zip(_comp_repaired, _comp_totals)
        ],
        "Repaired (Fast)": _comp_repaired_fast,
        "Repair Rate (Fast) (%)": [
            round(done / total * 100, 1)
            for done, total in zip(_comp_repaired_fast, _comp_totals)
        ],
    }
)
fixed_by_comp_df.sort_values("Total", inplace=True, ascending=False)


def init_leaderboard(dataframe):
    """Build the ``Leaderboard`` component for the given results table.

    Column types, default/locked/hidden column selections are all derived
    from the fields of ``AutoEvalColumn``.

    Args:
        dataframe: The table to display.

    Returns:
        A non-interactive ``Leaderboard`` searchable by method name and
        filterable by the hint column.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    columns = list(fields(AutoEvalColumn))

    column_picker = SelectColumns(
        default_selection=[col.name for col in columns if col.displayed_by_default],
        cant_deselect=[col.name for col in columns if col.never_hidden],
        label="Select Columns to Display:",
    )
    hint_filter = ColumnFilter(
        AutoEvalColumn.with_hint.name,
        type="checkboxgroup",
        label="Hint",
    )

    return Leaderboard(
        value=dataframe,
        datatype=[col.type for col in columns],
        select_columns=column_picker,
        search_columns=[AutoEvalColumn.method_name.name],
        hide_columns=[col.name for col in columns if col.hidden],
        filter_columns=[hint_filter],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


# NOTE(review): the nesting of the layout below was reconstructed from syntax;
# the plot and the two summary tables are assumed to live inside the
# Leaderboard tab - confirm against the deployed Space.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(
        INTRODUCTION_TEXT + f"\nTotal issues: {total_issues}\n",
        elem_classes="markdown-text",
    )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF[COLS])
            # The y range leaves one spare row below the last method and
            # reaches up to 4 on the positive side.
            gr.ScatterPlot(
                timeline_df,
                x="time",
                y="model",
                color="method_name",
                x_label="Time",
                y_label="Model",
                title="Timeline",
                y_lim=(-model_cnt - 1, 4),
                tooltip=["bug_id", "method_name", "time", "bug_type"],
            )
            gr.Dataframe(fixed_by_cat_df)
            gr.Dataframe(fixed_by_comp_df)

        with gr.TabItem("🚀 Submission", elem_id="llm-benchmark-tab-table", id=1):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=6,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Restart the Space every 30 minutes via a background job.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=30 * 60)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()